經過 Day 26 的 WAF 與 Shield 安全防護建置,我們已經為 Kyo System 建立了完整的應用層防護。今天我們要實作 CloudWatch 監控與告警自動化系統。在生產環境中,監控不僅是發現問題,更重要的是預防問題、自動化回應、以及持續優化。我們需要建立完整的可觀測性(Observability)架構:指標(Metrics)、日誌(Logs)、追蹤(Traces)。
/**
* CloudWatch 監控架構
*
* ┌──────────────────────────────────────────────┐
* │ Observability Architecture (3 Pillars) │
* └──────────────────────────────────────────────┘
*
* 1. Metrics (指標) - What happened?
* ┌─────────────────────────────────────┐
* │ CloudWatch Metrics │
* │ ├─ System Metrics │
* │ │ ├─ CPU Utilization │
* │ │ ├─ Memory Usage │
* │ │ ├─ Disk I/O │
* │ │ └─ Network Traffic │
* │ ├─ Application Metrics │
* │ │ ├─ Request Count │
* │ │ ├─ Response Time │
* │ │ ├─ Error Rate │
* │ │ └─ Active Users │
* │ └─ Business Metrics │
* │ ├─ OTP Sent │
* │ ├─ OTP Verified │
* │ └─ Revenue │
* └─────────────────────────────────────┘
*
* 2. Logs (日誌) - Why it happened?
* ┌─────────────────────────────────────┐
* │ CloudWatch Logs │
* │ ├─ Application Logs │
* │ │ └─ Pino JSON logs │
* │ ├─ Access Logs │
* │ │ ├─ ALB Access Logs │
* │ │ └─ CloudFront Logs │
* │ ├─ Error Logs │
* │ │ └─ Stack traces │
* │ └─ Audit Logs │
* │ └─ Security events │
* └─────────────────────────────────────┘
*
* 3. Traces (追蹤) - How it happened?
* ┌─────────────────────────────────────┐
* │ AWS X-Ray │
* │ ├─ Request Flow │
* │ │ ├─ ALB → ECS → RDS │
* │ │ └─ Latency breakdown │
* │ ├─ Service Map │
* │ │ └─ Dependencies │
* │ └─ Bottleneck Detection │
* │ └─ Slow queries │
* └─────────────────────────────────────┘
*
* Alerting & Auto-remediation
* ┌─────────────────────────────────────┐
* │ CloudWatch Alarms │
* │ ├─ Threshold-based │
* │ ├─ Anomaly Detection │
* │ └─ Composite Alarms │
* │ ↓ │
* │ SNS Topics │
* │ ├─ Email │
* │ ├─ Slack │
* │ └─ PagerDuty │
* │ ↓ │
* │ Lambda Auto-remediation │
* │ ├─ Restart Service │
* │ ├─ Scale Up │
* │ └─ Clear Cache │
* └─────────────────────────────────────┘
*
* 成本優化:
* - Metrics: $0.30 per custom metric/month
* - Logs: $0.50 per GB ingested
* - Insights queries: $0.005 per GB scanned
* - Alarms: $0.10 per alarm/month
* - X-Ray: $5 per 1M traces recorded
*/
// infrastructure/lib/monitoring-stack.ts
import * as cdk from 'aws-cdk-lib';
import * as cloudwatch from 'aws-cdk-lib/aws-cloudwatch';
import * as sns from 'aws-cdk-lib/aws-sns';
import * as subscriptions from 'aws-cdk-lib/aws-sns-subscriptions';
import * as lambda from 'aws-cdk-lib/aws-lambda';
import * as events from 'aws-cdk-lib/aws-events';
import * as targets from 'aws-cdk-lib/aws-events-targets';
import * as iam from 'aws-cdk-lib/aws-iam';
import * as actions from 'aws-cdk-lib/aws-cloudwatch-actions';
import { Construct } from 'constructs';
/**
 * Inputs for {@link MonitoringStack}.
 */
export interface MonitoringStackProps extends cdk.StackProps {
  /** ECS cluster name; used as the `ClusterName` metric dimension. */
  ecsClusterName: string;
  /** ECS service name; used as the `ServiceName` metric dimension. */
  ecsServiceName: string;
  /**
   * ARN of the ALB target group. Its last `:`-separated segment
   * (`targetgroup/<name>/<id>`) is used as the `TargetGroup` metric dimension.
   */
  albTargetGroupArn: string;
  /**
   * Value for the `LoadBalancer` metric dimension, in the form
   * `app/<load-balancer-name>/<id>`.
   *
   * Optional for backward compatibility only. When omitted, the stack falls
   * back to the second `/`-separated segment of `albTargetGroupArn` (the
   * target group name) and emits a synth-time warning, because that value is
   * not a load balancer identifier.
   */
  albLoadBalancerFullName?: string;
  /** Aurora/RDS cluster identifier; used as the `DBClusterIdentifier` dimension. */
  rdsClusterIdentifier: string;
  /** ElastiCache cluster id; used as the `CacheClusterId` dimension. */
  elastiCacheClusterId: string;
  /** E-mail address subscribed to the alarm topic. */
  notificationEmail: string;
  /** Slack incoming-webhook URL. When set, a notifier Lambda is subscribed to the alarm topic. */
  slackWebhookUrl?: string;
}

/** Settings for one alarm created through `MonitoringStack.addAlarm`. */
interface AlarmSpec {
  alarmName: string;
  metric: cloudwatch.IMetric;
  threshold: number;
  evaluationPeriods: number;
  /** Defaults to `GREATER_THAN_THRESHOLD`. */
  comparisonOperator?: cloudwatch.ComparisonOperator;
  /** Defaults to `NOT_BREACHING`. */
  treatMissingData?: cloudwatch.TreatMissingData;
}

/**
 * CloudWatch monitoring for the Kyo system: an SNS alarm topic (e-mail and
 * optional Slack), alarms for ECS / ALB / RDS / ElastiCache and custom
 * application metrics, a composite health alarm, a dashboard, and a Lambda
 * that force-redeploys the ECS service when the composite alarm fires.
 */
export class MonitoringStack extends cdk.Stack {
  /** Topic every alarm in this stack notifies. */
  public readonly alarmTopic: sns.Topic;

  constructor(scope: Construct, id: string, props: MonitoringStackProps) {
    super(scope, id, props);

    this.alarmTopic = new sns.Topic(this, 'AlarmTopic', {
      displayName: 'Kyo System Alarms',
      topicName: 'kyo-alarms',
    });

    // E-mail subscription
    this.alarmTopic.addSubscription(
      new subscriptions.EmailSubscription(props.notificationEmail),
    );

    // Slack subscription (only when a webhook is provided)
    if (props.slackWebhookUrl) {
      this.alarmTopic.addSubscription(
        new subscriptions.LambdaSubscription(
          this.createSlackNotifier(props.slackWebhookUrl),
        ),
      );
    }

    // Resolved once so the fallback warning is emitted a single time and the
    // alarms and dashboard are guaranteed to use the same dimensions.
    const albDimensions = this.resolveAlbDimensions(props);

    const ecsAlarms = this.createECSMetrics(props);
    const albAlarms = this.createALBMetrics(albDimensions);
    this.createRDSMetrics(props);
    this.createElastiCacheMetrics(props);
    this.createCustomMetrics();
    this.createCompositeAlarms(
      ecsAlarms.highCpu,
      ecsAlarms.highMemory,
      albAlarms.high5xx,
    );
    this.createDashboard(props, albDimensions);
    this.createAutoRemediationLambdas(props);
  }

  /**
   * Creates a metric alarm and wires its ALARM action to the alarm topic.
   * Construct ids and alarm names are passed through unchanged.
   */
  private addAlarm(id: string, spec: AlarmSpec): cloudwatch.Alarm {
    const alarm = new cloudwatch.Alarm(this, id, {
      alarmName: spec.alarmName,
      metric: spec.metric,
      threshold: spec.threshold,
      evaluationPeriods: spec.evaluationPeriods,
      comparisonOperator:
        spec.comparisonOperator ??
        cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
      treatMissingData:
        spec.treatMissingData ?? cloudwatch.TreatMissingData.NOT_BREACHING,
    });
    alarm.addAlarmAction(new actions.SnsAction(this.alarmTopic));
    return alarm;
  }

  /**
   * Builds the `TargetGroup` / `LoadBalancer` dimensions for AWS/ApplicationELB
   * metrics. Handles both concrete ARNs and unresolved CDK tokens (a token
   * cannot be split with plain string operations at synth time).
   */
  private resolveAlbDimensions(
    props: MonitoringStackProps,
  ): Record<string, string> {
    const arn = props.albTargetGroupArn;
    const isToken = cdk.Token.isUnresolved(arn);

    // arn:aws:elasticloadbalancing:<region>:<account>:targetgroup/<name>/<id>
    const targetGroup = isToken
      ? cdk.Fn.select(5, cdk.Fn.split(':', arn))
      : arn.slice(arn.lastIndexOf(':') + 1);

    let loadBalancer = props.albLoadBalancerFullName;
    if (loadBalancer === undefined) {
      // Legacy derivation kept for backward compatibility. It yields the
      // target group *name*, which does not match any LoadBalancer dimension.
      const legacy = isToken
        ? cdk.Fn.select(1, cdk.Fn.split('/', arn))
        : arn.split('/')[1];
      if (legacy === undefined) {
        throw new Error(
          `albTargetGroupArn does not look like a target group ARN: ${arn}`,
        );
      }
      loadBalancer = legacy;
      cdk.Annotations.of(this).addWarning(
        'albLoadBalancerFullName was not provided; the LoadBalancer metric ' +
          'dimension is derived from the target group ARN and will not match ' +
          'real ALB metrics. Pass the load balancer full name (app/<name>/<id>).',
      );
    }

    return { TargetGroup: targetGroup, LoadBalancer: loadBalancer };
  }

  /**
   * ECS service alarms: CPU, memory and running task count.
   *
   * @returns the CPU and memory alarms, which feed the composite health alarm.
   */
  private createECSMetrics(props: MonitoringStackProps): {
    highCpu: cloudwatch.Alarm;
    highMemory: cloudwatch.Alarm;
  } {
    const dimensionsMap = {
      ClusterName: props.ecsClusterName,
      ServiceName: props.ecsServiceName,
    };
    const ecsMetric = (namespace: string, metricName: string) =>
      new cloudwatch.Metric({
        namespace,
        metricName,
        dimensionsMap,
        statistic: 'Average',
        period: cdk.Duration.minutes(1),
      });

    const highCpu = this.addAlarm('ECSHighCPUAlarm', {
      alarmName: 'kyo-ecs-high-cpu',
      metric: ecsMetric('AWS/ECS', 'CPUUtilization'),
      threshold: 80, // percent
      evaluationPeriods: 2,
    });

    const highMemory = this.addAlarm('ECSHighMemoryAlarm', {
      alarmName: 'kyo-ecs-high-memory',
      metric: ecsMetric('AWS/ECS', 'MemoryUtilization'),
      threshold: 85, // percent
      evaluationPeriods: 2,
    });

    // RunningTaskCount is published by Container Insights under
    // ECS/ContainerInsights, not AWS/ECS. With treatMissingData = BREACHING a
    // wrong namespace would hold this alarm in ALARM permanently. Container
    // Insights must be enabled on the cluster for this metric to exist.
    this.addAlarm('ECSLowTaskCountAlarm', {
      alarmName: 'kyo-ecs-low-task-count',
      metric: ecsMetric('ECS/ContainerInsights', 'RunningTaskCount'),
      threshold: 2, // fewer than 2 running tasks
      evaluationPeriods: 1,
      comparisonOperator: cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD,
      treatMissingData: cloudwatch.TreatMissingData.BREACHING,
    });

    return { highCpu, highMemory };
  }

  /**
   * ALB alarms: healthy host count, target response time and target 5xx count.
   *
   * @returns the 5xx alarm, which feeds the composite health alarm.
   */
  private createALBMetrics(dimensionsMap: Record<string, string>): {
    high5xx: cloudwatch.Alarm;
  } {
    const albMetric = (metricName: string, statistic: string, minutes: number) =>
      new cloudwatch.Metric({
        namespace: 'AWS/ApplicationELB',
        metricName,
        dimensionsMap,
        statistic,
        period: cdk.Duration.minutes(minutes),
      });

    // No healthy target for two consecutive minutes (missing data counts as breaching).
    this.addAlarm('ALBUnhealthyTargetAlarm', {
      alarmName: 'kyo-alb-unhealthy-target',
      metric: albMetric('HealthyHostCount', 'Average', 1),
      threshold: 1,
      evaluationPeriods: 2,
      comparisonOperator: cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD,
      treatMissingData: cloudwatch.TreatMissingData.BREACHING,
    });

    this.addAlarm('ALBHighLatencyAlarm', {
      alarmName: 'kyo-alb-high-latency',
      metric: albMetric('TargetResponseTime', 'Average', 1),
      threshold: 1, // seconds
      evaluationPeriods: 3,
    });

    const high5xx = this.addAlarm('ALBHigh5xxAlarm', {
      alarmName: 'kyo-alb-high-5xx',
      metric: albMetric('HTTPCode_Target_5XX_Count', 'Sum', 5),
      threshold: 10, // more than 10 target 5xx responses in 5 minutes
      evaluationPeriods: 1,
    });

    return { high5xx };
  }

  /** RDS cluster alarms: CPU, connection count and Aurora replica lag. */
  private createRDSMetrics(props: MonitoringStackProps): void {
    const rdsMetric = (metricName: string, statistic: string, minutes: number) =>
      new cloudwatch.Metric({
        namespace: 'AWS/RDS',
        metricName,
        dimensionsMap: { DBClusterIdentifier: props.rdsClusterIdentifier },
        statistic,
        period: cdk.Duration.minutes(minutes),
      });

    this.addAlarm('RDSHighCPUAlarm', {
      alarmName: 'kyo-rds-high-cpu',
      metric: rdsMetric('CPUUtilization', 'Average', 5),
      threshold: 80,
      evaluationPeriods: 2,
    });

    this.addAlarm('RDSHighConnectionsAlarm', {
      alarmName: 'kyo-rds-high-connections',
      metric: rdsMetric('DatabaseConnections', 'Average', 1),
      threshold: 80, // assumes max_connections = 100
      evaluationPeriods: 2,
    });

    this.addAlarm('RDSHighReplicaLagAlarm', {
      alarmName: 'kyo-rds-high-replica-lag',
      metric: rdsMetric('AuroraReplicaLag', 'Maximum', 1),
      threshold: 1000, // 1 second, expressed in milliseconds
      evaluationPeriods: 2,
    });
  }

  /** ElastiCache alarms: CPU, memory usage percentage and evictions. */
  private createElastiCacheMetrics(props: MonitoringStackProps): void {
    const cacheMetric = (metricName: string, statistic: string, minutes: number) =>
      new cloudwatch.Metric({
        namespace: 'AWS/ElastiCache',
        metricName,
        dimensionsMap: { CacheClusterId: props.elastiCacheClusterId },
        statistic,
        period: cdk.Duration.minutes(minutes),
      });

    this.addAlarm('RedisHighCPUAlarm', {
      alarmName: 'kyo-redis-high-cpu',
      metric: cacheMetric('CPUUtilization', 'Average', 1),
      threshold: 75, // Redis is single-threaded, so 75% already deserves attention
      evaluationPeriods: 2,
    });

    this.addAlarm('RedisHighMemoryAlarm', {
      alarmName: 'kyo-redis-high-memory',
      metric: cacheMetric('DatabaseMemoryUsagePercentage', 'Average', 1),
      threshold: 80,
      evaluationPeriods: 2,
    });

    this.addAlarm('RedisEvictionsAlarm', {
      alarmName: 'kyo-redis-evictions',
      metric: cacheMetric('Evictions', 'Sum', 5),
      threshold: 100, // more than 100 evictions in 5 minutes
      evaluationPeriods: 1,
    });
  }

  /**
   * Alarms on application-published metrics in the `Kyo/Application` namespace.
   *
   * These read the dimensionless series of each metric. CloudWatch does not
   * aggregate custom metrics across dimensions, so the application must
   * publish a data point without dimensions for these alarms to see data.
   */
  private createCustomMetrics(): void {
    const namespace = 'Kyo/Application';
    const fiveMinutes = cdk.Duration.minutes(5);
    const otpCount = (metricName: string) =>
      new cloudwatch.Metric({
        namespace,
        metricName,
        statistic: 'Sum',
        period: fiveMinutes,
      });

    // FILL(..., 0) keeps the ratio defined when one side has no data points
    // in a period. Without it, a period with failures but zero successes
    // produces no result at all and the alarm treats it as not breaching.
    // When both sides are zero the division yields no data point, which is
    // still handled by NOT_BREACHING.
    const otpSuccessRate = new cloudwatch.MathExpression({
      expression:
        '100 * FILL(success, 0) / (FILL(success, 0) + FILL(failure, 0))',
      usingMetrics: {
        success: otpCount('OTPSendSuccess'),
        failure: otpCount('OTPSendFailure'),
      },
      period: fiveMinutes,
    });

    this.addAlarm('OTPLowSuccessRateAlarm', {
      alarmName: 'kyo-otp-low-success-rate',
      metric: otpSuccessRate,
      threshold: 95, // below 95%
      evaluationPeriods: 2,
      comparisonOperator: cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD,
    });

    this.addAlarm('APIHighP99LatencyAlarm', {
      alarmName: 'kyo-api-high-p99-latency',
      metric: new cloudwatch.Metric({
        namespace,
        metricName: 'APILatency',
        statistic: 'p99',
        period: fiveMinutes,
      }),
      threshold: 500, // milliseconds
      evaluationPeriods: 2,
    });
  }

  /**
   * Composite "system health" alarm: fires when (ECS CPU AND ECS memory are in
   * ALARM) OR the ALB 5xx alarm is in ALARM.
   *
   * The child alarms are passed as constructs rather than re-imported by a
   * hand-built ARN, so the rule references them directly and CloudFormation
   * creates them before the composite alarm.
   */
  private createCompositeAlarms(
    highCpu: cloudwatch.IAlarm,
    highMemory: cloudwatch.IAlarm,
    high5xx: cloudwatch.IAlarm,
  ): void {
    const { AlarmRule, AlarmState } = cloudwatch;

    // CompositeAlarmProps has no `alarmName`; the name is `compositeAlarmName`.
    const systemHealth = new cloudwatch.CompositeAlarm(this, 'SystemHealthAlarm', {
      compositeAlarmName: 'kyo-system-health',
      alarmDescription: 'System is unhealthy when multiple alarms trigger',
      alarmRule: AlarmRule.anyOf(
        AlarmRule.allOf(
          AlarmRule.fromAlarm(highCpu, AlarmState.ALARM),
          AlarmRule.fromAlarm(highMemory, AlarmState.ALARM),
        ),
        AlarmRule.fromAlarm(high5xx, AlarmState.ALARM),
      ),
      actionsEnabled: true,
    });
    systemHealth.addAlarmAction(new actions.SnsAction(this.alarmTopic));
  }

  /**
   * Dashboard with three graphs: ECS CPU/memory, ALB requests/errors and OTP
   * business metrics.
   */
  private createDashboard(
    props: MonitoringStackProps,
    albDimensions: Record<string, string>,
  ): void {
    const dashboard = new cloudwatch.Dashboard(this, 'KyoDashboard', {
      dashboardName: 'kyo-system-dashboard',
    });

    const ecsSeries = (metricName: string, label: string, color: string) =>
      new cloudwatch.Metric({
        namespace: 'AWS/ECS',
        metricName,
        dimensionsMap: {
          ClusterName: props.ecsClusterName,
          ServiceName: props.ecsServiceName,
        },
        statistic: 'Average',
        period: cdk.Duration.minutes(1),
        label,
        color,
      });

    // AWS/ApplicationELB publishes no dimensionless series, so the ALB graphs
    // must carry the same dimensions as the alarms or they stay empty.
    const albSeries = (metricName: string, label: string, color: string) =>
      new cloudwatch.Metric({
        namespace: 'AWS/ApplicationELB',
        metricName,
        dimensionsMap: albDimensions,
        statistic: 'Sum',
        period: cdk.Duration.minutes(1),
        label,
        color,
      });

    const otpSeries = (metricName: string, label: string, color: string) =>
      new cloudwatch.Metric({
        namespace: 'Kyo/Application',
        metricName,
        statistic: 'Sum',
        period: cdk.Duration.minutes(5),
        label,
        color,
      });

    dashboard.addWidgets(
      new cloudwatch.GraphWidget({
        title: 'ECS CPU & Memory',
        left: [ecsSeries('CPUUtilization', 'CPU %', cloudwatch.Color.BLUE)],
        right: [ecsSeries('MemoryUtilization', 'Memory %', cloudwatch.Color.GREEN)],
        width: 12,
      }),
    );

    dashboard.addWidgets(
      new cloudwatch.GraphWidget({
        title: 'ALB Requests & Errors',
        left: [albSeries('RequestCount', 'Requests', cloudwatch.Color.BLUE)],
        right: [
          albSeries('HTTPCode_Target_5XX_Count', '5xx Errors', cloudwatch.Color.RED),
          albSeries('HTTPCode_Target_4XX_Count', '4xx Errors', cloudwatch.Color.ORANGE),
        ],
        width: 12,
      }),
    );

    dashboard.addWidgets(
      new cloudwatch.GraphWidget({
        title: 'OTP Metrics',
        left: [
          otpSeries('OTPSendSuccess', 'Sent', cloudwatch.Color.GREEN),
          otpSeries('OTPVerifySuccess', 'Verified', cloudwatch.Color.BLUE),
        ],
        right: [otpSeries('OTPSendFailure', 'Failures', cloudwatch.Color.RED)],
        width: 12,
      }),
    );
  }

  /**
   * Lambda that forwards an SNS alarm notification to a Slack incoming webhook.
   *
   * The webhook URL is read from the `SLACK_WEBHOOK_URL` environment variable
   * at runtime instead of being interpolated into the function source, so a
   * quote in the URL cannot break or inject code and the secret is not
   * duplicated in the code asset.
   * NOTE(review): the URL is still visible in the function configuration;
   * consider a secret store if that is a concern.
   */
  private createSlackNotifier(webhookUrl: string): lambda.Function {
    const handlerSource = `
const https = require('https');

exports.handler = async (event) => {
  const message = JSON.parse(event.Records[0].Sns.Message);
  const alarmName = message.AlarmName;
  const newState = message.NewStateValue;
  const reason = message.NewStateReason;

  const payload = JSON.stringify({
    text: '*' + newState + '*: ' + alarmName,
    attachments: [{
      color: newState === 'ALARM' ? 'danger' : 'good',
      fields: [
        { title: 'Alarm', value: alarmName, short: true },
        { title: 'State', value: newState, short: true },
        { title: 'Reason', value: reason, short: false },
      ],
      footer: 'AWS CloudWatch',
      ts: Math.floor(Date.now() / 1000),
    }],
  });

  const target = new URL(process.env.SLACK_WEBHOOK_URL);

  return new Promise((resolve, reject) => {
    const req = https.request({
      hostname: target.hostname,
      path: target.pathname + target.search,
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'Content-Length': Buffer.byteLength(payload),
      },
    }, (res) => {
      // Drain the response so the socket is released.
      res.resume();
      if (res.statusCode < 200 || res.statusCode >= 300) {
        console.error('Slack webhook returned HTTP ' + res.statusCode);
      }
      resolve({ statusCode: res.statusCode });
    });
    req.on('error', reject);
    req.write(payload);
    req.end();
  });
};
`;

    return new lambda.Function(this, 'SlackNotifier', {
      functionName: 'kyo-slack-notifier',
      runtime: lambda.Runtime.NODEJS_18_X,
      handler: 'index.handler',
      code: lambda.Code.fromInline(handlerSource),
      timeout: cdk.Duration.seconds(10),
      environment: {
        SLACK_WEBHOOK_URL: webhookUrl,
      },
    });
  }

  /**
   * Auto-remediation: a Lambda subscribed to the alarm topic that forces a new
   * deployment of the ECS service (restarting its tasks) when the
   * `kyo-system-health` composite alarm enters ALARM.
   */
  private createAutoRemediationLambdas(props: MonitoringStackProps): void {
    // Cluster and service names come from environment variables rather than
    // being interpolated into the source.
    const handlerSource = `
const { ECSClient, UpdateServiceCommand } = require('@aws-sdk/client-ecs');
const ecs = new ECSClient({});

exports.handler = async (event) => {
  console.log('Event:', JSON.stringify(event, null, 2));
  const message = JSON.parse(event.Records[0].Sns.Message);
  const alarmName = message.AlarmName;

  // Only react to the composite health alarm, and only when it enters ALARM.
  if (alarmName !== 'kyo-system-health' || message.NewStateValue !== 'ALARM') {
    console.log('Ignoring alarm:', alarmName);
    return;
  }

  try {
    // Force a new deployment (restarts every task of the service).
    await ecs.send(new UpdateServiceCommand({
      cluster: process.env.ECS_CLUSTER_NAME,
      service: process.env.ECS_SERVICE_NAME,
      forceNewDeployment: true,
    }));
    console.log('Service restart triggered');
    return { statusCode: 200, body: 'Service restarted' };
  } catch (error) {
    console.error('Failed to restart service:', error);
    throw error;
  }
};
`;

    const restartServiceLambda = new lambda.Function(this, 'RestartServiceLambda', {
      functionName: 'kyo-restart-ecs-service',
      runtime: lambda.Runtime.NODEJS_18_X,
      handler: 'index.handler',
      code: lambda.Code.fromInline(handlerSource),
      timeout: cdk.Duration.seconds(30),
      environment: {
        ECS_CLUSTER_NAME: props.ecsClusterName,
        ECS_SERVICE_NAME: props.ecsServiceName,
      },
    });

    // Least privilege: only the one service this function is meant to restart.
    // NOTE(review): assumes the service uses the ARN format that includes the
    // cluster name (service/<cluster>/<service>) — confirm for older services.
    restartServiceLambda.addToRolePolicy(
      new iam.PolicyStatement({
        actions: ['ecs:UpdateService', 'ecs:DescribeServices'],
        resources: [
          this.formatArn({
            service: 'ecs',
            resource: 'service',
            resourceName: `${props.ecsClusterName}/${props.ecsServiceName}`,
          }),
        ],
      }),
    );

    this.alarmTopic.addSubscription(
      new subscriptions.LambdaSubscription(restartServiceLambda),
    );
  }
}
/**
 * Handy CloudWatch Logs Insights queries.
 *
 * Each constant holds the text of one query. The field names used below
 * (level, msg, err.message, responseTime, statusCode, ...) presumably match
 * the application's structured JSON log schema — verify against the logger
 * configuration before relying on them.
 */
// Query 1: error log statistics — counts entries whose level is "error" in 5-minute bins.
// NOTE(review): the query sorts by @timestamp after a stats aggregation — confirm the
// ordering behaves as intended.
const errorStatsQuery = `
fields @timestamp, level, msg, err.message
| filter level = "error"
| stats count() as error_count by bin(5m)
| sort @timestamp desc
`;
// Query 2: slow request detection — the 20 entries with the largest responseTime above 1000
// (presumably milliseconds, i.e. API response time > 1s).
const slowAPIQuery = `
fields @timestamp, msg, responseTime, method, url
| filter responseTime > 1000
| sort responseTime desc
| limit 20
`;
// Query 3: 5xx error analysis — counts entries with statusCode >= 500, grouped by status code and URL.
const error5xxQuery = `
fields @timestamp, statusCode, url, err.message, err.stack
| filter statusCode >= 500
| stats count() as error_count by statusCode, url
| sort error_count desc
`;
// Query 4: user login activity — counts "User logged in" entries per user id and IP address.
const loginActivityQuery = `
fields @timestamp, userId, email, ipAddress, userAgent
| filter msg = "User logged in"
| stats count() as login_count by userId, ipAddress
| sort login_count desc
`;
// Query 5: API endpoint usage — request count, average and p95 responseTime per "METHOD url" endpoint.
const apiUsageQuery = `
fields @timestamp, method, url, responseTime
| filter @message like /API request completed/
| stats count() as request_count, avg(responseTime) as avg_latency, pct(responseTime, 95) as p95_latency by concat(method, " ", url) as endpoint
| sort request_count desc
`;
// Query 6: rate-limit hits — counts "Rate limit exceeded" entries per IP address and endpoint.
const rateLimitQuery = `
fields @timestamp, ipAddress, userId, endpoint
| filter msg = "Rate limit exceeded"
| stats count() as hit_count by ipAddress, endpoint
| sort hit_count desc
`;
// Query 7: OTP send failure reasons — counts OTP_SEND_FAILED entries grouped by error message.
const otpFailureQuery = `
fields @timestamp, phoneNumber, provider, err.message
| filter action = "OTP_SEND_FAILED"
| stats count() as failure_count by err.message
| sort failure_count desc
`;
// apps/kyo-otp-service/src/lib/metrics.ts
import {
CloudWatchClient,
PutMetricDataCommand,
MetricDatum,
StandardUnit,
} from '@aws-sdk/client-cloudwatch';
/**
 * Buffers custom metric data points and publishes them to CloudWatch with
 * `PutMetricData`.
 *
 * Data points are flushed as soon as {@link MetricsPublisher.FLUSH_THRESHOLD}
 * of them are buffered, or {@link MetricsPublisher.FLUSH_DELAY_MS} after the
 * first buffered point, whichever comes first. Batches that fail to publish
 * are kept for the next flush; the buffer is capped so a prolonged outage
 * cannot grow memory without bound. Call {@link MetricsPublisher.close}
 * during shutdown to send whatever is still buffered.
 */
export class MetricsPublisher {
  /** Number of buffered data points that triggers an immediate flush. */
  private static readonly FLUSH_THRESHOLD = 20;
  /** Delay before a partially filled buffer is flushed. */
  private static readonly FLUSH_DELAY_MS = 1000;
  /**
   * Upper bound on data points per request, so a buffer that grew during an
   * outage is sent in several requests rather than one of arbitrary size.
   * NOTE(review): chosen conservatively — confirm against the PutMetricData limits.
   */
  private static readonly MAX_DATUMS_PER_REQUEST = 150;
  /** Upper bound on retained data points; the oldest are dropped beyond this. */
  private static readonly MAX_BUFFERED_DATUMS = 1000;

  private client: CloudWatchClient;
  private namespace: string;
  private buffer: MetricDatum[] = [];
  private flushTimer: NodeJS.Timeout | null = null;

  /**
   * @param namespace CloudWatch namespace every data point is published under.
   */
  constructor(namespace: string = 'Kyo/Application') {
    this.client = new CloudWatchClient({});
    this.namespace = namespace;
  }

  /**
   * Records a count.
   *
   * @param metricName metric name
   * @param value amount to record (defaults to 1)
   * @param dimensions optional dimension name/value pairs
   */
  putCount(metricName: string, value: number = 1, dimensions?: Record<string, string>) {
    this.putMetric(metricName, value, StandardUnit.Count, dimensions);
  }

  /**
   * Records a duration in milliseconds.
   */
  putTime(metricName: string, milliseconds: number, dimensions?: Record<string, string>) {
    this.putMetric(metricName, milliseconds, StandardUnit.Milliseconds, dimensions);
  }

  /**
   * Records a percentage.
   */
  putPercent(metricName: string, percent: number, dimensions?: Record<string, string>) {
    this.putMetric(metricName, percent, StandardUnit.Percent, dimensions);
  }

  /**
   * Buffers one data point and triggers or schedules a flush.
   */
  private putMetric(
    metricName: string,
    value: number,
    unit: StandardUnit,
    dimensions?: Record<string, string>
  ) {
    // A NaN/Infinity value would make the whole batch fail; because failed
    // batches are re-buffered, that single point would then block every
    // later publish. Drop it up front instead.
    if (!Number.isFinite(value)) {
      console.warn(`Dropping metric ${metricName}: non-finite value ${value}`);
      return;
    }

    const datum: MetricDatum = {
      MetricName: metricName,
      Value: value,
      Unit: unit,
      Timestamp: new Date(),
      Dimensions: dimensions
        ? Object.entries(dimensions).map(([dimensionName, dimensionValue]) => ({
            Name: dimensionName,
            Value: dimensionValue,
          }))
        : undefined,
    };
    this.buffer.push(datum);
    this.trimBuffer();

    // Batch publishing: flush at the threshold, otherwise after a short delay.
    if (this.buffer.length >= MetricsPublisher.FLUSH_THRESHOLD) {
      // flush() handles its own errors, so it is safe to not await it here.
      void this.flush();
    } else {
      this.scheduleFlush();
    }
  }

  /**
   * Schedules a delayed flush unless one is already pending.
   */
  private scheduleFlush() {
    if (this.flushTimer) return;
    this.flushTimer = setTimeout(() => {
      this.flushTimer = null;
      void this.flush();
    }, MetricsPublisher.FLUSH_DELAY_MS);
  }

  /**
   * Drops the oldest data points once the buffer exceeds its cap.
   */
  private trimBuffer() {
    const excess = this.buffer.length - MetricsPublisher.MAX_BUFFERED_DATUMS;
    if (excess > 0) {
      this.buffer.splice(0, excess);
      console.warn(`Metrics buffer full; dropped ${excess} oldest data point(s)`);
    }
  }

  /**
   * Publishes buffered data points to CloudWatch, one bounded batch at a time.
   * Never rejects: a failed batch is logged, put back at the front of the
   * buffer, and retried on the next flush.
   */
  private async flush(): Promise<void> {
    while (this.buffer.length > 0) {
      const batch = this.buffer.splice(0, MetricsPublisher.MAX_DATUMS_PER_REQUEST);
      try {
        await this.client.send(
          new PutMetricDataCommand({
            Namespace: this.namespace,
            MetricData: batch,
          })
        );
        console.log(`Published ${batch.length} metrics to CloudWatch`);
      } catch (error) {
        console.error('Failed to publish metrics:', error);
        // Put the failed batch back in front of anything buffered meanwhile,
        // then stop; retrying immediately would just fail again.
        this.buffer.unshift(...batch);
        this.trimBuffer();
        return;
      }
    }
  }

  /**
   * Cancels any pending timer and flushes the buffer. Intended for shutdown.
   */
  async close() {
    if (this.flushTimer) {
      clearTimeout(this.flushTimer);
      this.flushTimer = null;
    }
    await this.flush();
  }
}
// Shared singleton publisher, created with the default namespace.
export const metrics = new MetricsPublisher();
// apps/kyo-otp-service/src/routes/otp.ts
import { metrics } from '../lib/metrics';
/**
 * POST /send — sends an OTP to the phone number in the request body and
 * records success/failure counters and the send latency.
 *
 * Each counter is published twice: once without dimensions and once with
 * them. CloudWatch treats every dimension combination as a separate metric
 * and does not roll custom metrics up across dimensions, so alarms or graphs
 * defined on the bare metric name only see the dimensionless series.
 * Do not sum across all series of a metric name, or each event counts twice.
 */
otpRouter.post('/send', async (request, reply) => {
  const startTime = Date.now();
  try {
    const { phoneNumber } = request.body;

    // Send the OTP
    await otpService.send(phoneNumber);

    // Success counters: aggregate series, then the per-provider breakdown.
    metrics.putCount('OTPSendSuccess');
    metrics.putCount('OTPSendSuccess', 1, {
      Provider: 'mitake',
      Country: 'TW',
    });

    // Latency of the successful send
    metrics.putTime('OTPSendLatency', Date.now() - startTime);

    return { success: true };
  } catch (error: unknown) {
    // The caught value is `unknown`; read `code` defensively and make sure
    // the dimension value is a non-empty string.
    const code =
      typeof error === 'object' && error !== null && 'code' in error
        ? (error as { code?: unknown }).code
        : undefined;
    let errorType = 'unknown';
    if (typeof code === 'string' && code.length > 0) {
      errorType = code;
    } else if (typeof code === 'number') {
      errorType = String(code);
    }

    // Failure counters: aggregate series, then the per-error breakdown.
    metrics.putCount('OTPSendFailure');
    metrics.putCount('OTPSendFailure', 1, {
      Provider: 'mitake',
      ErrorType: errorType,
    });

    throw error;
  }
});
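我們今天完成了 Kyo System 的 CloudWatch 完整監控:
三大支柱:指標(Metrics)、日誌(Logs)、追蹤(Traces)。
告警策略:以門檻告警監控 ECS、ALB、RDS、ElastiCache 與自訂指標,並以複合告警(kyo-system-health)彙整,透過 SNS 通知 Email 與 Slack。
成本優化:CloudWatch 依自訂指標數、日誌寫入量、Insights 掃描量、告警數與 X-Ray 追蹤數計費;自訂指標以批次方式發送。
自動修復:複合告警觸發時,由 Lambda 強制重新部署 ECS 服務。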